#!/bin/bash

# Copyright (c) 2008-2015, Max Krasnyansky <max.krasnyansky@gmail.com> 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# 
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
# 
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.##

# sysctl command line used for every kernel tunable written by this script.
# It is expanded unquoted at the call sites so that the command and its flags
# are split into separate words.  Per sysctl(8): -q do not echo the value,
# -e ignore unknown keys (several keys below are marked "if supported"),
# -w write the setting.
SYSCTL="/sbin/sysctl -q -e -w"

# Directory where the cpuset cgroup hierarchy is expected to be mounted
# (init_env verifies this before anything else runs).
CPUSETDIR=/sys/fs/cgroup/cpuset

# Setup the environment.
# Verifies that the cpuset hierarchy is available under $CPUSETDIR and
# computes the mask of all CPUs in the system.
#  Globals read:    CPUSETDIR
#  Globals written: ALL_CPUS_MASK
# Exits with status 1 when the cpuset fs is not available or when the CPU
# mask could not be computed.
init_env()
{
	# Init cpusets
	if [[ ! -d $CPUSETDIR || ! -f $CPUSETDIR/cpuset.cpus ]]; then
		echo "$CPUSETDIR does not exist or cpuset fs is not mounted"
		exit 1
	fi

	# All CPUs in the system: the root cpuset's CPU list run through bitops
	# (presumably list -> hex mask conversion; bitops is an external tool).
	ALL_CPUS_MASK=$(bitops --fmt=lX < "$CPUSETDIR/cpuset.cpus")

	# An empty mask means bitops is missing or failed; continuing would
	# make later steps write empty masks into the kernel interfaces.
	if [[ -z $ALL_CPUS_MASK ]]; then
		echo "Unable to compute the mask of all CPUs (is bitops installed?)"
		exit 1
	fi
}

# Task move helper
#  $1 - cpuset name (. for root)
#  $2 - cpumask
# Walks every /proc/<pid> entry once.  A task whose status file has a
# "VmSize:" line is treated as a userspace task and its pid is written to
# the cpuset's cgroup.procs; any other task gets its affinity set with
# taskset.  All errors are suppressed (best effort).
#  Globals read: CPUSETDIR
do_move_tasks()
{
	local cpuset=$1
	local mask=$2
	local p line pid userspace
	local -a status

	# Iterate over all tasks in the system
	for p in /proc/[0-9]*; do
		# Skip entries whose status cannot be read (e.g. the task exited
		# after the glob was expanded).
		mapfile -t status < "$p/status" || continue

		pid=
		userspace=0
		for line in "${status[@]}"; do
			case $line in
				Pid:*)    pid=${line#*:[[:space:]]} ;;
				VmSize:*) userspace=1 ;;
			esac
		done

		# Nothing sensible can be done without a pid
		[[ -n $pid ]] || continue

		if (( userspace )); then
			echo "$pid" > "$CPUSETDIR/$cpuset/cgroup.procs"
		else
			taskset -p "$mask" "$pid" &>/dev/null
		fi
	done 2>/dev/null
}

# Move tasks into the specified partition.
#  $1 - cpuset name (/ or . for root)
#  $2 - cpumask
# Userspace tasks are moved into the appropriate cpuset.
# Kernel tasks simply get their cpumask adjusted because
# most kernel threads are ignored by cpusets anyways.
#  Globals read: SYSCTL
move_tasks()
{
	local cpuset=$1
	local mask=$2

	# "/" is accepted as an alias for the root cpuset directory
	[[ $cpuset == / ]] && cpuset=.

	# Set default kthread cpumask (if supported).
	# $SYSCTL is intentionally unquoted: it holds the command and its flags.
	$SYSCTL "kernel.default_kthread_cpumask=$mask"

	# Run this twice to handle potential race where first iteration
	# might miss new tasks created after we scan the PID space.
	do_move_tasks "$cpuset" "$mask"
	do_move_tasks "$cpuset" "$mask"
}

# Create partitions (GPP and ISP)
#  $1 - List of ISP CPUs
#  $2 - List of ISP Memory nodes (defaults to 0)
#  $3 - List of GPP Memory nodes (defaults to 0)
#  Globals read:    ALL_CPUS_MASK, CPUSETDIR, SYSCTL
#  Globals written: ISP_CPUS_LIST, ISP_MEMS_LIST, GPP_MEMS_LIST,
#                   ISP_CPUS_MASK, GPP_CPUS_MASK, GPP_CPUS_LIST, ISP_EACH_CPU
# Exits 1 when no ISP CPUs were given or the masks cannot be computed;
# returns 1 when the system is already partitioned.
create_partitions()
{
	local cpu irq wq

	ISP_CPUS_LIST=$1
	ISP_MEMS_LIST=${2:-0}
	GPP_MEMS_LIST=${3:-0}

	if [[ -z $ISP_CPUS_LIST ]]; then
		echo "ISP CPUs list (--isp-cpus) was not specified."
		exit 1
	fi

	# GPP is every CPU that is not in ISP
	ISP_CPUS_MASK=$(echo "$ISP_CPUS_LIST" | bitops --fmt=lX)
	GPP_CPUS_MASK=$(echo "$ALL_CPUS_MASK" | bitops --andnot "$ISP_CPUS_MASK")
	GPP_CPUS_LIST=$(echo "$GPP_CPUS_MASK" | bitops --fmt=xL)

	# ISP CPUs as separate words, one per CPU (used unquoted in the loops)
	ISP_EACH_CPU=$(echo "$ISP_CPUS_MASK" | bitops --fmt=xE)

	# Empty masks mean bitops is missing or failed.  Bail out before any
	# system state is touched instead of writing empty masks everywhere.
	if [[ -z $ISP_CPUS_MASK || -z $GPP_CPUS_MASK ]]; then
		echo "Unable to compute ISP/GPP CPU masks (is bitops installed?)"
		exit 1
	fi

	# Do not start twice to avoid conflicts.
	# For example we do not want to bring CPUs on/offline
	# if stopmachine is disabled on them.
	if [[ -d $CPUSETDIR/gpp ]]; then
		echo "System partitioning is already setup."
		return 1
	fi

	# Set default IRQ affinity to GPP cpus
	echo "$GPP_CPUS_MASK" > /proc/irq/default_smp_affinity

	# Set affinity for all active IRQs (write errors are ignored on purpose)
	for irq in /proc/irq/[0-9]*; do
		echo "$GPP_CPUS_MASK" > "$irq/smp_affinity" 2>/dev/null
	done

	# Constrain unbounded workqueues to GPP cpus
	for wq in /sys/bus/workqueue/devices/*; do
		[[ -f $wq/cpumask ]] && echo "$GPP_CPUS_MASK" > "$wq/cpumask"
	done

	# Disable systemwide load balancing
	echo 0 > "$CPUSETDIR/cpuset.sched_load_balance"

	# Bring ISP cpus offline
	for cpu in $ISP_EACH_CPU; do
		echo 0 > "/sys/devices/system/cpu/cpu$cpu/online"
	done

	# Create gpp cpuset
	mkdir "$CPUSETDIR/gpp"
	echo "$GPP_CPUS_LIST" > "$CPUSETDIR/gpp/cpuset.cpus"
	echo "$GPP_MEMS_LIST" > "$CPUSETDIR/gpp/cpuset.mems"
	echo 0 > "$CPUSETDIR/gpp/cpuset.sched_load_balance"

	# Move tasks into GPP
	move_tasks gpp "$GPP_CPUS_MASK"

	# Bring ISP cpus online.
	# Put each CPU into a separate cpuset, otherwise they
	# endup in the same root domain which causes rt scheduler
	# lock contention.
	for cpu in $ISP_EACH_CPU; do
		echo 1 > "/sys/devices/system/cpu/cpu$cpu/online"
		mkdir "$CPUSETDIR/cpu$cpu"
		echo "$cpu" > "$CPUSETDIR/cpu$cpu/cpuset.cpus"
		echo "$ISP_MEMS_LIST" > "$CPUSETDIR/cpu$cpu/cpuset.mems"
	done

	# Move any tasks that might have started/changed due to
	# CPU onlining.
	move_tasks gpp "$GPP_CPUS_MASK"

	# $SYSCTL is intentionally unquoted below: it holds command plus flags.

	# Disable stopmachine (if supported)
	$SYSCTL "kernel.stopmachine_disabled=1"

	# Restrict pagedrain to GPP cpus (if supported)
	$SYSCTL "vm.pagedrain_cpumask=$GPP_CPUS_MASK"

	# Disable scheduler RT throttling
	$SYSCTL "kernel.sched_rt_runtime_us=-1"

	# Disable softlockup detection.
	# Ideally this should be a cpumask
	$SYSCTL "kernel.softlockup_thresh=-1"

	# Enable balancing in GPP.
	# We do it as the last step to avoid redundant domain
	# rebuilds.
	echo 1 > "$CPUSETDIR/gpp/cpuset.sched_load_balance"
}

# Destroy partitions (GPP and ISP)
# Reverts the settings applied by create_partitions: sysctls, cpusets,
# load balancing, workqueue masks and IRQ affinity.
#  Globals read: ALL_CPUS_MASK, CPUSETDIR, SYSCTL
destroy_partitions()
{
	local cs wq irq

	# $SYSCTL is intentionally unquoted below: it holds command plus flags.

	# Enable stopmachine (if supported)
	$SYSCTL "kernel.stopmachine_disabled=0"

	# Unrestrict pagedrain (if supported)
	$SYSCTL "vm.pagedrain_cpumask=$ALL_CPUS_MASK"

	# Enable scheduler RT throttling
	$SYSCTL "kernel.sched_rt_runtime_us=950000"

	# Enable softlockup detection
	$SYSCTL "kernel.softlockup_thresh=60"

	# Remove gpp cpuset.
	# The loop keeps moving tasks back to the root cpuset and retrying
	# until the directory is gone (presumably rmdir fails while tasks are
	# still attached).  NOTE(review): this never terminates if rmdir keeps
	# failing for another reason, e.g. insufficient privileges.
	while [[ -d $CPUSETDIR/gpp ]]; do
		move_tasks / "$ALL_CPUS_MASK"
		rmdir "$CPUSETDIR/gpp" &>/dev/null
	done

	# Remove per cpu cpusets
	for cs in "$CPUSETDIR"/cpu[0-9]*; do
		# An unmatched glob is left as the literal pattern; skip it
		# instead of handing it to rmdir.
		[[ -d $cs ]] || continue
		rmdir "$cs"
	done

	# Enable systemwide balancing
	echo 1 > "$CPUSETDIR/cpuset.sched_load_balance"

	# UnConstrain unbounded workqueues to GPP cpus
	for wq in /sys/bus/workqueue/devices/*; do
		[[ -f $wq/cpumask ]] && echo "$ALL_CPUS_MASK" > "$wq/cpumask"
	done

	# Set default IRQ affinity
	echo "$ALL_CPUS_MASK" > /proc/irq/default_smp_affinity

	# Set affinity for all active IRQs (write errors are ignored on purpose)
	for irq in /proc/irq/[0-9]*; do
		echo "$ALL_CPUS_MASK" > "$irq/smp_affinity" 2>/dev/null
	done
}

# Check cpumask of all tasks in the system; print tasks with cpumask that doesn't
# match the expected mask
# 	$1 - expected cpumask
# 	$2 - proc filesystem root (optional, defaults to /proc)
# Output: a header line, then one line per mismatching task with its name,
# pid, "(kthread,not-movable)" style annotations, cpumask and last CPU,
# formatted by column(1).
check_tasks()
{
	local cpumask_exp=$1
	local procdir=${2:-/proc}
	local p line pid name cpumask kthread
	local stat_line rest flags last_cpu movable extra
	local -a status fields

	echo "Checking all tasks: expected cpumask $cpumask_exp"

	for p in "$procdir"/[0-9]*; do
		# Parse /proc/pid/status
		[[ -r $p/status ]] || continue
		mapfile -t status < "$p/status" || continue

		pid=
		name=
		cpumask=
		kthread=1
		for line in "${status[@]}"; do
			case $line in
				Pid:*)    pid=${line#*:[[:space:]]}  ;;
				Name:*)   name=${line#*:[[:space:]]} ;;
				Cpu*ed:*)
					cpumask=${line#*:[[:space:]]}
					# Strip leading zeros
					cpumask=${cpumask#"${cpumask%%[!0]*}"}
					;;
				# Only tasks with an address space report VmSize
				VmSize:*) kthread=0 ;;
			esac
		done

		# Parse /proc/pid/stat.
		# Field 2 (comm) is parenthesised and may itself contain spaces
		# or ')', so everything through the LAST ") " is dropped before
		# splitting.  The remainder starts at field 3, which puts
		# flags (field 9) at index 6 and processor (field 39) at index 36.
		stat_line=
		[[ -r $p/stat ]] && stat_line=$(<"$p/stat")
		rest=${stat_line##*') '}
		read -ra fields <<< "$rest"
		flags=${fields[6]:-0}
		last_cpu=${fields[36]}
		(( movable = (flags & 0x4000000) == 0 )) # linux/sched.h:#define PF_NO_SETAFFINITY 0x04000000

		extra="("
		(( kthread )) && extra+="kthread,"
		(( movable )) || extra+="not-movable,"
		extra=${extra%,}")"

		if [[ $cpumask != "$cpumask_exp" ]]; then
			printf '%s\t- pid:%s %s\tcpumask:%s\tlast_cpu:%s\n' \
				"$name" "$pid" "$extra" "$cpumask" "$last_cpu"
		fi
	done | column -t
}

# Check cpumask of all IRQs in the system; print IRQs with cpumask that doesn't
# match the expected mask
# 	$1 - expected cpumask
# 	$2 - IRQ directory root (optional, defaults to /proc/irq)
# Exits 1 when the default IRQ affinity differs from the expected mask.
# Otherwise prints a header and one line per mismatching IRQ, formatted by
# column(1).
check_irqs()
{
	local cpumask_exp=$1
	local irqdir=${2:-/proc/irq}
	local cpumask_dft cpumask p f irq irq_name

	# Masks are compared with leading zeros stripped
	cpumask_dft=$(sed 's/^0*//g' "$irqdir/default_smp_affinity")

	if [[ $cpumask_dft != "$cpumask_exp" ]]; then
		echo "Default IRQ affinity mask is incorrect: $cpumask_dft expected $cpumask_exp"
		exit 1
	fi

	echo "Checking all IRQs: expected cpumask $cpumask_exp"

	for p in "$irqdir"/[0-9]*; do
		cpumask=$(sed 's/^0*//g' "$p/smp_affinity")
		irq=${p##*/}

		# Label the IRQ with the names of the subdirectories of its entry,
		# comma separated.  The names are data and never used as a printf
		# format.
		irq_name=
		for f in "$p"/*; do
			[[ -d $f ]] && irq_name+="${f##*/},"
		done

		if [[ $cpumask != "$cpumask_exp" ]]; then
			printf '%s(%s)\t- \tcpumask:%s expected %s\n' \
				"${irq_name%,}" "$irq" "$cpumask" "$cpumask_exp"
		fi
	done | column -t -s $'\t'
}

# Check partitions (GPP and ISP)
#  $1 - Expected list of ISP CPUs (optional; not compared when empty)
#  $2 - Expected list of ISP Memory nodes (stored, currently not compared)
#  $3 - Expected list of GPP Memory nodes (stored, currently not compared)
# Detects the current partitioning from the gpp cpuset, prints it and then
# runs check_tasks and check_irqs against the GPP cpumask.
#  Globals read:    ALL_CPUS_MASK, CPUSETDIR
#  Globals written: ISP_*_EXP, GPP_MEMS_LIST_EXP, GPP_CPUS_LIST, GPP_MEMS_LIST,
#                   GPP_CPUS_MASK, ISP_CPUS_MASK, ISP_CPUS_LIST, ISP_EACH_CPU,
#                   ISP_MEMS_LIST
# Exits 1 when the system is not partitioned, the ISP CPU list differs from
# the expected one, or a per-CPU cpuset is missing.
check_partitions()
{
	local cpu

	ISP_CPUS_LIST_EXP=$1
	ISP_MEMS_LIST_EXP=$2
	GPP_MEMS_LIST_EXP=$3

	# Detect GPP setup
	if [[ ! -d $CPUSETDIR/gpp ]]; then
		echo "GPP cpuset doesn't exist. Looks like the system is not partitioned."
		exit 1
	fi

	# ISP is every CPU that is not in the gpp cpuset
	GPP_CPUS_LIST=$(<"$CPUSETDIR/gpp/cpuset.cpus")
	GPP_MEMS_LIST=$(<"$CPUSETDIR/gpp/cpuset.mems")
	GPP_CPUS_MASK=$(echo "$GPP_CPUS_LIST" | bitops --fmt=lX)
	ISP_CPUS_MASK=$(echo "$ALL_CPUS_MASK" | bitops --andnot "$GPP_CPUS_MASK")
	ISP_CPUS_LIST=$(echo "$ISP_CPUS_MASK" | bitops --fmt=xL)
	ISP_EACH_CPU=$(echo "$ISP_CPUS_MASK" | bitops --fmt=xE)

	if [[ -n $ISP_CPUS_LIST_EXP && $ISP_CPUS_LIST_EXP != "$ISP_CPUS_LIST" ]]; then
		echo "ISP CPUs list $ISP_CPUS_LIST is incorrect, expected $ISP_CPUS_LIST_EXP"
		exit 1
	fi

	# Check cpusets for each CPU in ISP.
	# $ISP_EACH_CPU is intentionally unquoted: one word per CPU.
	for cpu in $ISP_EACH_CPU; do
		if [[ ! -d $CPUSETDIR/cpu$cpu ]]; then
			echo "Missing cpuset for cpu$cpu"
			exit 1
		fi
		# The value of the last CPU visited is the one reported below
		ISP_MEMS_LIST=$(<"$CPUSETDIR/cpu$cpu/cpuset.mems")
	done

	echo "Detected system partitioning:"
	printf '\tisp-cpus: %s isp-mems: %s\n' "$ISP_CPUS_LIST" "$ISP_MEMS_LIST"
	printf '\tgpp-cpus: %s gpp-mems: %s\n' "$GPP_CPUS_LIST" "$GPP_MEMS_LIST"

	echo
	check_tasks "$GPP_CPUS_MASK"

	echo
	check_irqs "$GPP_CPUS_MASK"
}

# Print the command line help (options and actions) on stdout.
# Each argument below is one output line; %b expands the \t escapes.
usage()
{
	printf '%b\n' \
		"Usage: syspart [options] <action>" \
		"Options:" \
		"\t--isp-cpus CPUs\tList of CPUs assigned to ISP (isolated partition)" \
		"\t--isp-mems MEMs\tList of memory nodes assigned to ISP" \
		"\t--gpp-mems MEMs\tList of memory nodes assigned to GPP (general purpose partition)" \
		"Actions:" \
		"\t--create\tCreate partitions" \
		"\t--destroy\tDestroy partitions" \
		"\t--check\tVerify system partitions (tasks, IRQs, etc)"
}

# Option values filled in by parse_opts; empty means "not given".
# --isp-cpus: list of CPUs assigned to ISP
opt_isp_cpus=
# --isp-mems: list of memory nodes assigned to ISP
opt_isp_mems=
# --gpp-mems: list of memory nodes assigned to GPP
opt_gpp_mems=

# Parse command line arguments.
#  $@ - arguments as given to the script
# Options that take a value accept both "--opt value" and "--opt=value".
#  Globals written: mode, opt_isp_cpus, opt_isp_mems, opt_gpp_mems
# --help prints the usage and exits 0.  An unknown option, or a value
# option given as the last word without a value, prints the usage and
# exits 1.
parse_opts()
{
	local option optarg nwords

	mode=""
	while (( $# > 0 )); do
		# Split "--opt=value" at the first '='; otherwise the value, if
		# the option takes one, is the next word.  nwords is the number
		# of words a value option consumes.
		case $1 in
		--*=*)
			option=${1%%=*}
			optarg=${1#*=}
			nwords=1
			;;
		*)
			option=$1
			optarg=${2-}
			nwords=2
			;;
		esac

		case $option in
		--isp-cpus|--isp-mems|--gpp-mems)
			if (( nwords > $# )); then
				echo "Option $option requires a value."
				usage
				exit 1
			fi
			case $option in
			--isp-cpus) opt_isp_cpus=$optarg ;;
			--isp-mems) opt_isp_mems=$optarg ;;
			--gpp-mems) opt_gpp_mems=$optarg ;;
			esac
			shift "$nwords"
			;;

		--create|--destroy|--check)
			# The action name is the option without its leading dashes
			mode=${option#--}
			shift
			;;
		--help)
			usage
			exit 0
			;;
		*)
			usage
			exit 1
			;;
		esac
	done
}

# Parse command line options.
# "$@" passes every argument through as exactly one word; an unquoted $*
# would re-split values containing whitespace and expand glob characters.
parse_opts "$@"

# Run the requested action; without one, print the usage.
case $mode in
	create)
		init_env
		create_partitions "$opt_isp_cpus" "$opt_isp_mems" "$opt_gpp_mems"
		;;

	destroy)
		init_env
		destroy_partitions
		;;

	check)
		init_env
		check_partitions "$opt_isp_cpus" "$opt_isp_mems" "$opt_gpp_mems"
		;;

	*)
		usage
		;;
esac
